{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Pre-Processing "
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Preproccessing is an important step in the data mining process and machine learning. It is used for \"garbage in, garbage out.\" In machine learning and data mining, the data gathering methods are often loosely controlled, resulting in out-of-range values, impossible data combinations, missing values, etc."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "\n",
        "import warnings\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "# fix_yahoo_finance is used to fetch data \n",
        "import fix_yahoo_finance as yf\n",
        "yf.pdr_override()"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# input\n",
        "symbol = 'AMD'\n",
        "start = '2007-01-01'\n",
        "end = '2018-12-31'\n",
        "\n",
        "# Read data \n",
        "dataset = yf.download(symbol,start,end)\n",
        "\n",
        "# View Columns\n",
        "dataset.head()"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[*********************100%***********************]  1 of 1 downloaded\n"
          ]
        },
        {
          "output_type": "execute_result",
          "execution_count": 2,
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Close</th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Volume</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Date</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2007-01-03</th>\n",
              "      <td>20.080000</td>\n",
              "      <td>20.400000</td>\n",
              "      <td>19.350000</td>\n",
              "      <td>19.520000</td>\n",
              "      <td>19.520000</td>\n",
              "      <td>28350300</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-04</th>\n",
              "      <td>19.660000</td>\n",
              "      <td>19.860001</td>\n",
              "      <td>19.320000</td>\n",
              "      <td>19.790001</td>\n",
              "      <td>19.790001</td>\n",
              "      <td>23652500</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-05</th>\n",
              "      <td>19.540001</td>\n",
              "      <td>19.910000</td>\n",
              "      <td>19.540001</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>15902400</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-08</th>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.860001</td>\n",
              "      <td>19.370001</td>\n",
              "      <td>19.469999</td>\n",
              "      <td>19.469999</td>\n",
              "      <td>15814800</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-09</th>\n",
              "      <td>19.450001</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.370001</td>\n",
              "      <td>19.650000</td>\n",
              "      <td>19.650000</td>\n",
              "      <td>14494200</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                 Open       High        Low      Close  Adj Close    Volume\n",
              "Date                                                                       \n",
              "2007-01-03  20.080000  20.400000  19.350000  19.520000  19.520000  28350300\n",
              "2007-01-04  19.660000  19.860001  19.320000  19.790001  19.790001  23652500\n",
              "2007-01-05  19.540001  19.910000  19.540001  19.709999  19.709999  15902400\n",
              "2007-01-08  19.709999  19.860001  19.370001  19.469999  19.469999  15814800\n",
              "2007-01-09  19.450001  19.709999  19.370001  19.650000  19.650000  14494200"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 2,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)\n",
        "dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)\n",
        "dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)\n",
        "dataset['Returns'] = dataset['Adj Close'].pct_change()\n",
        "dataset = dataset.dropna()\n",
        "dataset.head()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 3,
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Open</th>\n",
              "      <th>High</th>\n",
              "      <th>Low</th>\n",
              "      <th>Close</th>\n",
              "      <th>Adj Close</th>\n",
              "      <th>Volume</th>\n",
              "      <th>Increase_Decrease</th>\n",
              "      <th>Buy_Sell_on_Open</th>\n",
              "      <th>Buy_Sell</th>\n",
              "      <th>Returns</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Date</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>2007-01-04</th>\n",
              "      <td>19.660000</td>\n",
              "      <td>19.860001</td>\n",
              "      <td>19.320000</td>\n",
              "      <td>19.790001</td>\n",
              "      <td>19.790001</td>\n",
              "      <td>23652500</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>0.013832</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-05</th>\n",
              "      <td>19.540001</td>\n",
              "      <td>19.910000</td>\n",
              "      <td>19.540001</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>15902400</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>-0.004043</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-08</th>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.860001</td>\n",
              "      <td>19.370001</td>\n",
              "      <td>19.469999</td>\n",
              "      <td>19.469999</td>\n",
              "      <td>15814800</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>1</td>\n",
              "      <td>-0.012177</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-09</th>\n",
              "      <td>19.450001</td>\n",
              "      <td>19.709999</td>\n",
              "      <td>19.370001</td>\n",
              "      <td>19.650000</td>\n",
              "      <td>19.650000</td>\n",
              "      <td>14494200</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0.009245</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2007-01-10</th>\n",
              "      <td>19.639999</td>\n",
              "      <td>20.020000</td>\n",
              "      <td>19.500000</td>\n",
              "      <td>20.010000</td>\n",
              "      <td>20.010000</td>\n",
              "      <td>19783200</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>0.018321</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                 Open       High        Low      Close  Adj Close    Volume  \\\n",
              "Date                                                                          \n",
              "2007-01-04  19.660000  19.860001  19.320000  19.790001  19.790001  23652500   \n",
              "2007-01-05  19.540001  19.910000  19.540001  19.709999  19.709999  15902400   \n",
              "2007-01-08  19.709999  19.860001  19.370001  19.469999  19.469999  15814800   \n",
              "2007-01-09  19.450001  19.709999  19.370001  19.650000  19.650000  14494200   \n",
              "2007-01-10  19.639999  20.020000  19.500000  20.010000  20.010000  19783200   \n",
              "\n",
              "            Increase_Decrease  Buy_Sell_on_Open  Buy_Sell   Returns  \n",
              "Date                                                                 \n",
              "2007-01-04                  0                 0         0  0.013832  \n",
              "2007-01-05                  0                 1         0 -0.004043  \n",
              "2007-01-08                  0                 0         1 -0.012177  \n",
              "2007-01-09                  1                 1         1  0.009245  \n",
              "2007-01-10                  1                 1         1  0.018321  "
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 3,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "X = dataset[['Open', 'High', 'Low', 'Volume']].values\n",
        "y = dataset['Adj Close'].values"
      ],
      "outputs": [],
      "execution_count": 4,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Rescaling Data"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Rescaling data is multiplying each member of a data set by a constant k; that is to say, transforming each number x to f(X), where f(x) = kx, and k and x are both real numbers. Rescaling will change the spread of your data as well as the position of your data points. (https://www.statisticshowto.datasciencecentral.com/what-is-rescaling-data/)"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import MinMaxScaler"
      ],
      "outputs": [],
      "execution_count": 23,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "scaler=MinMaxScaler(feature_range=(0,1))\n",
        "rescaledX=scaler.fit_transform(X)\n",
        "np.set_printoptions(precision=3) #Setting precision for the output\n",
        "rescaledX[0:5,:]"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 25,
          "data": {
            "text/plain": [
              "array([[ 0.572,  0.56 ,  0.579,  0.073],\n",
              "       [ 0.568,  0.561,  0.586,  0.049],\n",
              "       [ 0.573,  0.56 ,  0.581,  0.049],\n",
              "       [ 0.565,  0.555,  0.581,  0.045],\n",
              "       [ 0.571,  0.565,  0.585,  0.061]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 25,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Standardizing Data"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Standardizing Data: A standardized variable (sometimes called a z-score or a standard score) is a variable that has been rescaled to have a mean of zero and a standard deviation of one. (https://stats.idre.ucla.edu/stata/faq/how-do-i-standardize-variables-in-stata/)"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import StandardScaler"
      ],
      "outputs": [],
      "execution_count": 7,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "scaler=StandardScaler().fit(X)\n",
        "rescaledX=scaler.transform(X)\n",
        "rescaledX[0:5,:]"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 8,
          "data": {
            "text/plain": [
              "array([[ 2.45048633,  2.40507386,  2.48078494, -0.2809095 ],\n",
              "       [ 2.42666905,  2.41477762,  2.5256097 , -0.53851508],\n",
              "       [ 2.46041008,  2.40507386,  2.49097254, -0.54142682],\n",
              "       [ 2.40880595,  2.37596163,  2.49097254, -0.58532225],\n",
              "       [ 2.44651656,  2.4361263 ,  2.51745957, -0.40952117]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 8,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Normalizing Data"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Normalizing Data: normalize your data between the range of 0 and 1. "
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import Normalizer"
      ],
      "outputs": [],
      "execution_count": 9,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "scaler=Normalizer().fit(X)\n",
        "normalizedX=scaler.transform(X)\n",
        "normalizedX[0:5,:]"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 10,
          "data": {
            "text/plain": [
              "array([[  8.31201776e-07,   8.39657584e-07,   8.16826974e-07,\n",
              "          1.00000000e+00],\n",
              "       [  1.22874541e-06,   1.25201227e-06,   1.22874541e-06,\n",
              "          1.00000000e+00],\n",
              "       [  1.24630087e-06,   1.25578578e-06,   1.22480215e-06,\n",
              "          1.00000000e+00],\n",
              "       [  1.34191615e-06,   1.35985422e-06,   1.33639670e-06,\n",
              "          1.00000000e+00],\n",
              "       [  9.92761484e-07,   1.01196975e-06,   9.85684823e-07,\n",
              "          1.00000000e+00]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 10,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Binarizing Data"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Binarization Data is the process of transforming data features of any entity into vectors of binary numbers to make classifier algorithms more efficient. (https://deepai.org/machine-learning-glossary-and-terms/binarization)"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import Binarizer"
      ],
      "outputs": [],
      "execution_count": 11,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "binarizer=Binarizer(threshold=0.0).fit(X)\n",
        "binaryX=binarizer.transform(X)\n",
        "binaryX[0:5,:]"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 12,
          "data": {
            "text/plain": [
              "array([[ 1.,  1.,  1.,  1.],\n",
              "       [ 1.,  1.,  1.,  1.],\n",
              "       [ 1.,  1.,  1.,  1.],\n",
              "       [ 1.,  1.,  1.,  1.],\n",
              "       [ 1.,  1.,  1.,  1.]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 12,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Mean Removal"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Mean Removal is a process that remove the mean from each column or feature to center it on zero."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import scalemean"
      ],
      "outputs": [],
      "execution_count": 13,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "data_standardized=scale(dataset)\n",
        "data_standardized.mean(axis=0)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 14,
          "data": {
            "text/plain": [
              "array([  1.60042749e-16,  -2.07114146e-16,  -8.94356541e-17,\n",
              "         1.88285587e-17,   1.88285587e-17,   5.64856762e-17,\n",
              "         3.29499778e-17,  -2.35356984e-18,   6.58999556e-17,\n",
              "         2.35356984e-18])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 14,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "data_standardized.std(axis=0)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 15,
          "data": {
            "text/plain": [
              "array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 15,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## One Hot Encoding"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction. (https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f)"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import OneHotEncoder"
      ],
      "outputs": [],
      "execution_count": 16,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "encoder=OneHotEncoder()\n",
        "encoder.fit(X)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 17,
          "data": {
            "text/plain": [
              "OneHotEncoder(categorical_features=None, categories=None,\n",
              "       dtype=<class 'numpy.float64'>, handle_unknown='error',\n",
              "       n_values='auto', sparse=True)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 17,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Label Encoding"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Label Encoding is for data that has categorical variables and convert data into numbers."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import LabelEncoder"
      ],
      "outputs": [],
      "execution_count": 18,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "label_encoder=LabelEncoder()\n",
        "input_classes=['Apple','Intel','Microsoft','Google','Tesla'] # We will use company names\n",
        "label_encoder.fit(input_classes)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 19,
          "data": {
            "text/plain": [
              "LabelEncoder()"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 19,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "for i,companies in enumerate(label_encoder.classes_):\n",
        "    print(companies,'-->',i)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Apple --> 0\n",
            "Google --> 1\n",
            "Intel --> 2\n",
            "Microsoft --> 3\n",
            "Tesla --> 4\n"
          ]
        }
      ],
      "execution_count": 20,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "labels=['Apple','Intel','Microsoft']\n",
        "label_encoder.transform(labels)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 21,
          "data": {
            "text/plain": [
              "array([0, 2, 3], dtype=int64)"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 21,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "label_encoder.inverse_transform(label_encoder.transform(labels))"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 22,
          "data": {
            "text/plain": [
              "array(['Apple', 'Intel', 'Microsoft'],\n",
              "      dtype='<U9')"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 22,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "##  DictVectorizor"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "DictVectorizor is used for data that has labels and numbers. In addition, DictVectorizor extract data."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.feature_extraction import DictVectorizer"
      ],
      "outputs": [],
      "execution_count": 31,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "companies = [{'Apple':180.25,'Intel':45.30,'Microsoft':30.26,'Google':203.75,'Tesla':302.18}] # We will use company names\n",
        "vec = DictVectorizer()"
      ],
      "outputs": [],
      "execution_count": 36,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "vec.fit_transform(companies).toarray()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 37,
          "data": {
            "text/plain": [
              "array([[ 180.25,  203.75,   45.3 ,   30.26,  302.18]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 37,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "vec.get_feature_names()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 38,
          "data": {
            "text/plain": [
              "['Apple', 'Google', 'Intel', 'Microsoft', 'Tesla']"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 38,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Polynomial Features"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Polynomial Features is used to generate polynomial and interaction features. Also, it generate a new data of feature matrixx that is consist of all polynomial combinations of the features with degree less than or equal to specified degree."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import PolynomialFeatures"
      ],
      "outputs": [],
      "execution_count": 26,
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "poly = PolynomialFeatures(2)\n",
        "poly.fit_transform(X)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 27,
          "data": {
            "text/plain": [
              "array([[  1.000e+00,   1.966e+01,   1.986e+01, ...,   3.733e+02,\n",
              "          4.570e+08,   5.594e+14],\n",
              "       [  1.000e+00,   1.954e+01,   1.991e+01, ...,   3.818e+02,\n",
              "          3.107e+08,   2.529e+14],\n",
              "       [  1.000e+00,   1.971e+01,   1.986e+01, ...,   3.752e+02,\n",
              "          3.063e+08,   2.501e+14],\n",
              "       ..., \n",
              "       [  1.000e+00,   1.743e+01,   1.774e+01, ...,   2.703e+02,\n",
              "          1.831e+09,   1.240e+16],\n",
              "       [  1.000e+00,   1.753e+01,   1.831e+01, ...,   2.938e+02,\n",
              "          1.872e+09,   1.193e+16],\n",
              "       [  1.000e+00,   1.815e+01,   1.851e+01, ...,   3.186e+02,\n",
              "          1.512e+09,   7.180e+15]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 27,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "poly = PolynomialFeatures(interaction_only=True)\n",
        "poly.fit_transform(X)"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 28,
          "data": {
            "text/plain": [
              "array([[  1.000e+00,   1.966e+01,   1.986e+01, ...,   3.837e+02,\n",
              "          4.697e+08,   4.570e+08],\n",
              "       [  1.000e+00,   1.954e+01,   1.991e+01, ...,   3.890e+02,\n",
              "          3.166e+08,   3.107e+08],\n",
              "       [  1.000e+00,   1.971e+01,   1.986e+01, ...,   3.847e+02,\n",
              "          3.141e+08,   3.063e+08],\n",
              "       ..., \n",
              "       [  1.000e+00,   1.743e+01,   1.774e+01, ...,   2.916e+02,\n",
              "          1.976e+09,   1.831e+09],\n",
              "       [  1.000e+00,   1.753e+01,   1.831e+01, ...,   3.138e+02,\n",
              "          2.000e+09,   1.872e+09],\n",
              "       [  1.000e+00,   1.815e+01,   1.851e+01, ...,   3.304e+02,\n",
              "          1.568e+09,   1.512e+09]])"
            ]
          },
          "metadata": {}
        }
      ],
      "execution_count": 28,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Imputer"
      ],
      "metadata": {}
    },
    {
      "cell_type": "markdown",
      "source": [
        "Imputer (impute missing values with means) - it replaces the missing values with mean value in the columns or features data. "
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import Imputer"
      ],
      "outputs": [],
      "execution_count": 29,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    },
    {
      "cell_type": "code",
      "source": [
        "imputer = Imputer()\n",
        "print(imputer.fit_transform(X, y))"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[[  1.966e+01   1.986e+01   1.932e+01   2.365e+07]\n",
            " [  1.954e+01   1.991e+01   1.954e+01   1.590e+07]\n",
            " [  1.971e+01   1.986e+01   1.937e+01   1.581e+07]\n",
            " ..., \n",
            " [  1.743e+01   1.774e+01   1.644e+01   1.114e+08]\n",
            " [  1.753e+01   1.831e+01   1.714e+01   1.092e+08]\n",
            " [  1.815e+01   1.851e+01   1.785e+01   8.473e+07]]\n"
          ]
        }
      ],
      "execution_count": 30,
      "metadata": {
        "collapsed": false,
        "outputHidden": false,
        "inputHidden": false
      }
    }
  ],
  "metadata": {
    "kernel_info": {
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "file_extension": ".py",
      "version": "3.5.5",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "version": 3,
        "name": "ipython"
      },
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3"
    },
    "kernelspec": {
      "name": "python3",
      "language": "python",
      "display_name": "Python 3"
    },
    "nteract": {
      "version": "0.12.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}